{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Import Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import emoji\n",
    "import re\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.metrics import classification_report\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Retrieve Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "f1 = open('tweets.json')\n",
    "data_tweets = json.load(f1).items()\n",
    "user_label = pd.read_csv('labeled_users_clean.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Unnamed: 0  pic_id               id       user_id lang            location  \\\n",
      "0           0       0    _____zac_____  4.614412e+08   en       Maryland, USA   \n",
      "1           1       1         ___aleia  7.650000e+17   en           Ohio, USA   \n",
      "2           2       3  ___schaeffer___  1.257110e+09   en             The Lou   \n",
      "3           3       9    __EmilyRice__  3.797155e+09   en    Marble Falls, TX   \n",
      "4           4      10      __ginaaaa__  1.941566e+09   en  West Virginia, USA   \n",
      "\n",
      "   protected  followers_count  friends_count  statuses_count  \\\n",
      "0          0         0.000419       0.002298        0.060043   \n",
      "1          0         0.000948       0.005010        0.007060   \n",
      "2          0         0.001656       0.008522        0.057368   \n",
      "3          0         0.000316       0.002453        0.002550   \n",
      "4          0         0.001410       0.005785        0.029273   \n",
      "\n",
      "   favourites_count  account_created_at  verified  \\\n",
      "0          0.010963            0.361569       0.0   \n",
      "1          0.061531            0.714660       0.0   \n",
      "2          0.189428            0.450789       0.0   \n",
      "3          0.006811            0.648580       0.0   \n",
      "4          0.054823            0.495037       0.0   \n",
      "\n",
      "   num.tweets.used.Lexicon.prediction  Lexicon.age.prediction  \\\n",
      "0                            0.615385                0.212996   \n",
      "1                            0.615385                0.174979   \n",
      "2                            0.328671                0.297446   \n",
      "3                            0.615385                0.186579   \n",
      "4                            0.615385                0.256006   \n",
      "\n",
      "   Lexicon.gender.prediction..index.  lexicon.gender.prediction  \\\n",
      "0                           0.374761                        1.0   \n",
      "1                           0.556965                        0.0   \n",
      "2                           0.215564                        1.0   \n",
      "3                           0.661171                        0.0   \n",
      "4                           0.455204                        1.0   \n",
      "\n",
      "   human.labeled.age  age  \n",
      "0           0.160714  1.0  \n",
      "1           0.089286  0.0  \n",
      "2           0.142857  0.0  \n",
      "3           0.107143  0.0  \n",
      "4           0.142857  0.0  \n",
      "980\n"
     ]
    }
   ],
   "source": [
    "# Rename column for merging\n",
    "user_label.rename(columns={\"screen_name\": \"id\"}, inplace=True)\n",
    "print(user_label.head())\n",
    "print(len(user_label))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                id                                             tweets\n",
      "0    _____zac_____  [@AdvoBarryRoux @GetVidBot, The owner of drip ...\n",
      "1         ___aleia  [I haven’t talked to this girl since my sophom...\n",
      "2          ___Dals  [It come wit it 😭 https://t.co/ENyEtlphtP, @na...\n",
      "3  ___schaeffer___  [☝🏼👋🏼 https://t.co/xGJLlzLR7g, https://t.co/7N...\n",
      "4   __andresiscool  [Enough https://t.co/gLarVLIHxW, Some of the w...\n",
      "2678\n"
     ]
    }
   ],
   "source": [
    "data_tweets = pd.DataFrame.from_dict(data_tweets)\n",
    "data_tweets.rename(columns={0: \"id\", 1: \"tweets\"}, inplace=True)\n",
    "print(data_tweets[:5])\n",
    "print(len(data_tweets))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "980\n"
     ]
    }
   ],
   "source": [
    "# CSV data after cleaning and selection\n",
    "alldata = pd.merge(data_tweets, user_label, on='id')\n",
    "print(len(alldata))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract 3 useful columns ID, tweets, age\n",
    "idAndTweets = alldata[[\"id\",\"tweets\",\"age\"]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean data - remove mentions, urls, hashtags, emoji, punctuations, special chars, stop words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/francischen/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py:3607: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._set_item(key, value)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>tweets</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>_____zac_____</td>\n",
       "      <td>[' ', \"The owner of drip doesn't even have 100...</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>___aleia</td>\n",
       "      <td>['I haven’t talked to this girl since my sopho...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>___schaeffer___</td>\n",
       "      <td>['  '  '  ' 37-14-9', ' congrats sis keep work...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>__EmilyRice__</td>\n",
       "      <td>[' yes but come to san marcos and live with me...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>__ginaaaa__</td>\n",
       "      <td>[' small :)', 'Go get ready for dinner.  '  JA...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                id                                             tweets  age\n",
       "0    _____zac_____  [' ', \"The owner of drip doesn't even have 100...  1.0\n",
       "1         ___aleia  ['I haven’t talked to this girl since my sopho...  0.0\n",
       "2  ___schaeffer___  ['  '  '  ' 37-14-9', ' congrats sis keep work...  0.0\n",
       "3    __EmilyRice__  [' yes but come to san marcos and live with me...  0.0\n",
       "4      __ginaaaa__  [' small :)', 'Go get ready for dinner.  '  JA...  0.0"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#remove mentions, urls, hashtags.\n",
    "regexMap = {r\"@[\\w]+\": \"\", r\"http[\\S]+\": \"\", r\"#[\\w]+\": \"\"}\n",
    "\n",
    "def cleaning(data):\n",
    "    t = data\n",
    "    for regx in regexMap.keys():\n",
    "        t = re.sub(regx, regexMap[regx], str(t))\n",
    "    return t\n",
    "\n",
    "#remove emojis\n",
    "def deEmojify(data):\n",
    "    t = data\n",
    "    return emoji.get_emoji_regexp().sub('', str(t))\n",
    "\n",
    "\n",
    "idAndTweets[\"tweets\"] = idAndTweets[\"tweets\"].apply(cleaning)\n",
    "idAndTweets[\"tweets\"] = idAndTweets[\"tweets\"].apply(deEmojify)\n",
    "idAndTweets.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-18-691668277b31>:2: FutureWarning: The default value of regex will change from True to False in a future version.\n",
      "  idAndTweets['tweets'] = idAndTweets['tweets'].str.replace(\"[^a-zA-Z#]\", \" \")\n",
      "/Users/francischen/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py:3607: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._set_item(key, value)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>tweets</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>_____zac_____</td>\n",
       "      <td>The owner of drip doesn t even have    ...</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>___aleia</td>\n",
       "      <td>I haven t talked to this girl since my sopho...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>___schaeffer___</td>\n",
       "      <td>congrats sis keep work...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>__EmilyRice__</td>\n",
       "      <td>yes but come to san marcos and live with me...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>__ginaaaa__</td>\n",
       "      <td>small       Go get ready for dinner      JA...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>__masonsmith__</td>\n",
       "      <td>pain    b g ten basketball is dead     ...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>__sammybear__</td>\n",
       "      <td>sugar daddies or Venmo send me money pl...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>_AJoseph_</td>\n",
       "      <td>What a dumb ass petty islanders tweet       ...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>_ashleelyon</td>\n",
       "      <td>I just registered to save a life with   You ...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>_ashleyshaffer</td>\n",
       "      <td>The All American soundtrack is pretty fuckin...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                id                                             tweets  age\n",
       "0    _____zac_____         The owner of drip doesn t even have    ...  1.0\n",
       "1         ___aleia    I haven t talked to this girl since my sopho...  0.0\n",
       "2  ___schaeffer___                          congrats sis keep work...  0.0\n",
       "3    __EmilyRice__     yes but come to san marcos and live with me...  0.0\n",
       "4      __ginaaaa__     small       Go get ready for dinner      JA...  0.0\n",
       "5   __masonsmith__         pain    b g ten basketball is dead     ...  0.0\n",
       "6    __sammybear__         sugar daddies or Venmo send me money pl...  0.0\n",
       "7        _AJoseph_    What a dumb ass petty islanders tweet       ...  0.0\n",
       "8      _ashleelyon    I just registered to save a life with   You ...  0.0\n",
       "9   _ashleyshaffer    The All American soundtrack is pretty fuckin...  0.0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# removing punctuations, numbers, and special characters\n",
    "idAndTweets['tweets'] = idAndTweets['tweets'].str.replace(\"[^a-zA-Z#]\", \" \")\n",
    "idAndTweets.head(10)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/francischen/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "/Users/francischen/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py:3607: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  self._set_item(key, value)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>tweets</th>\n",
       "      <th>age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>_____zac_____</td>\n",
       "      <td>The owner drip even mill drip company aint got...</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>___aleia</td>\n",
       "      <td>I talked girl since sophomore year I slid stor...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>___schaeffer___</td>\n",
       "      <td>congrats sis keep workin fs spending memorial ...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>__EmilyRice__</td>\n",
       "      <td>yes come san marcos live see next week square ...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>__ginaaaa__</td>\n",
       "      <td>small Go get ready dinner JACK related I need ...</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                id                                             tweets  age\n",
       "0    _____zac_____  The owner drip even mill drip company aint got...  1.0\n",
       "1         ___aleia  I talked girl since sophomore year I slid stor...  0.0\n",
       "2  ___schaeffer___  congrats sis keep workin fs spending memorial ...  0.0\n",
       "3    __EmilyRice__  yes come san marcos live see next week square ...  0.0\n",
       "4      __ginaaaa__  small Go get ready dinner JACK related I need ...  0.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#remove stop words\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "nltk.download('stopwords')\n",
    "stop_words = set(stopwords.words('english'))\n",
    "idAndTweets['tweets'] = idAndTweets['tweets'].apply(\n",
    "    lambda x: ' '.join([w for w in x.split() if w not in stop_words]))\n",
    "idAndTweets.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                id                                             tweets  age\n",
      "0    _____zac_____  The owner drip even mill drip company aint got...  1.0\n",
      "1         ___aleia  I talked girl since sophomore year I slid stor...  0.0\n",
      "2  ___schaeffer___  congrats sis keep workin fs spending memorial ...  0.0\n",
      "3    __EmilyRice__  yes come san marcos live see next week square ...  0.0\n",
      "4      __ginaaaa__  small Go get ready dinner JACK related I need ...  0.0\n",
      "2940\n"
     ]
    }
   ],
   "source": [
    "print(idAndTweets.head())\n",
    "print(idAndTweets.size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TF-IDF vectorizer & Logistic Regression, Oversampling, 5-fold cross validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from imblearn.over_sampling import SMOTE\n",
    "tfidf = TfidfVectorizer(max_features=7500, ngram_range=(1, 2))\n",
    "X = tfidf.fit_transform(idAndTweets['tweets'])\n",
    "y = idAndTweets['age']\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=1/4.0, random_state=0)\n",
    "oversample = SMOTE()\n",
    "X_train, y_train = oversample.fit_resample(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# K-fold (5-fold)\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from imblearn.over_sampling import SMOTE\n",
    "tfidf = TfidfVectorizer(max_features=7500, ngram_range=(1, 2))\n",
    "\n",
    "X = tfidf.fit_transform(idAndTweets['tweets'])\n",
    "y = idAndTweets['age']\n",
    "\n",
    "kf = KFold(n_splits=5)\n",
    "kf.get_n_splits(X)\n",
    "\n",
    "for train_index, test_index in kf.split(X):\n",
    "    #print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    X_train, X_test = X[train_index], X[test_index]\n",
    "    y_train, y_test = y[train_index], y[test_index]\n",
    "\n",
    "# Oversampling\n",
    "oversample = SMOTE()\n",
    "X_train, y_train = oversample.fit_resample(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic Regression\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "log = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(X_train, y_train)\n",
    "y_pred = log.predict(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.77      0.91      0.83       129\n",
      "         1.0       0.74      0.46      0.57        67\n",
      "\n",
      "    accuracy                           0.76       196\n",
      "   macro avg       0.75      0.69      0.70       196\n",
      "weighted avg       0.76      0.76      0.74       196\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(classification_report(y_test, y_pred))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Complement Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.tokenize import RegexpTokenizer\n",
    "\n",
    "training_dataset = idAndTweets.sample(frac=0.7, ignore_index=True)\n",
    "training_dataset.head()\n",
    "test_dataset = idAndTweets.drop(training_dataset.index)\n",
    "\n",
    "token = RegexpTokenizer(r'[a-zA-Z0-9]+')\n",
    "vec = CountVectorizer(stop_words='english', ngram_range=(1, 3), tokenizer=token. tokenize)\n",
    "\n",
    "x_train = vec.fit_transform(training_dataset['tweets'].values)\n",
    "y_train = training_dataset['age']\n",
    "x_test = vec.transform(test_dataset['tweets'])\n",
    "y_test= test_dataset['age']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ComplementNB()"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.naive_bayes import ComplementNB\n",
    "model = ComplementNB()\n",
    "model.fit(x_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9985422740524781"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.score(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9251700680272109"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.score(x_test,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.90      1.00      0.95       202\n",
      "         1.0       1.00      0.76      0.86        92\n",
      "\n",
      "    accuracy                           0.93       294\n",
      "   macro avg       0.95      0.88      0.91       294\n",
      "weighted avg       0.93      0.93      0.92       294\n",
      "\n"
     ]
    }
   ],
   "source": [
    "y_prediction = model.predict(x_test)\n",
    "print(classification_report(y_test,y_prediction))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "feb31ced21f8554558e77c29e0ccd8c26567d80dfb262a466b58f961fe2079d9"
  },
  "kernelspec": {
   "display_name": "Python 3.8.8 64-bit ('base': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
